# Emulating example 14.3.8 in HTF: K-means clustering with Human Tumor
# Microarray Data
# Jill McCracken
# September 18, 2009

# Load data ----

cancerdat <- read.csv(
  "C:/Documents and Settings/m29287/My Documents/Analytics/Statistics/HTF -- Summer 2009/Clustering example/cancer_dataT.csv",
  header = TRUE
)
dim(cancerdat)

# cancerdat contains both the train / test data sets from HTF.
# It has been TRANSPOSED prior to loading into R.
# Note that cancerdat has 198 rows (cancer samples) and 16066 columns.
# The first three columns indicate cancer type and sample from the HTF website.
# Columns 4 - 16066 are the different genes, our clustering variables.
cancerdatall <- cancerdat[, 4:16066]
types <- cancerdat[, 1:3]

# Scale the data prior to clustering. scale() returns the standardized matrix
# rather than modifying its argument, so the result has to be assigned back;
# otherwise kmeans() below would run on the raw, unscaled values.
# NOTE(review): a gene column with zero variance would become NaN here and make
# kmeans() fail -- assumes no constant columns; confirm against the data.
cancerdatall <- scale(cancerdatall)

# Set the seed so we can reproduce results (so kmeans will pick the same
# initial cluster centers).
set.seed(123)

# Cluster all genes ----

# Start with clustering all of the data, k = 4.
cl <- kmeans(cancerdatall, 4)
ssall <- sum(cl$withinss)
ssall

# Now try clustering all of the data with k = 2 through 10.
# Column 1 of clk holds k, column 2 the total within-cluster sum of squares.
clk <- matrix(0, 9, 2)
for (i in seq_len(nrow(clk))) {
  k <- i + 1
  clkclus <- kmeans(cancerdatall, k)
  clk[i, 1] <- k
  clk[i, 2] <- sum(clkclus$withinss)
}

# Cross-tabulate cancer type against the k = 4 cluster assignment
# (the cluster labels are appended as column 4).
clres <- cbind(types, cl$cluster)
table(clres$type, clres[, 4])

# Cluster three gene subsets ----

# Split the data into three (nearly) equal sized groups of genes and compare
# results for k = 3.
cancerdat1 <- cancerdat[, 4:5357]
cancerdat2 <- cancerdat[, 5358:10712]
cancerdat3 <- cancerdat[, 10713:16066]

# As above, keep the scaled result instead of discarding it.
cancerdat1 <- scale(cancerdat1)
cancerdat2 <- scale(cancerdat2)
cancerdat3 <- scale(cancerdat3)

cl1 <- kmeans(cancerdat1, 3)
cl2 <- kmeans(cancerdat2, 3)
cl3 <- kmeans(cancerdat3, 3)

ss1 <- sum(cl1$withinss)
ss2 <- sum(cl2$withinss)
ss3 <- sum(cl3$withinss)
ss1
ss2
ss3

cl1$size
cl2$size
cl3$size

cl1res <- cbind(types, cl1$cluster)
cl2res <- cbind(types, cl2$cluster)
cl3res <- cbind(types, cl3$cluster)
table(cl1res$type, cl1res[, 4])
table(cl2res$type, cl2res[, 4])
table(cl3res$type, cl3res[, 4])

# Sweep k on the third gene subset ----

# cancerdat3 has the lowest WithinSS and good separation of the cancer types,
# so let's try that data set for different values of k.
# NOTE(review): that observation was recorded when the scale() results were
# being discarded; re-check it now that the data is actually scaled.
cl3k <- matrix(0, 9, 2)
for (i in seq_len(nrow(cl3k))) {
  k <- i + 1
  cl3kclus <- kmeans(cancerdat3, k)
  cl3k[i, 1] <- k
  cl3k[i, 2] <- sum(cl3kclus$withinss)
}
plot(cl3k[, 1], cl3k[, 2], xlab = "k", ylab = "TotWithinSS", col = "red")

cl3k8clus <- kmeans(cancerdat3, 8)
cl3k8clus$size
cl3k8res <- cbind(types, cl3k8clus$cluster)
table(cl3k8res$type, cl3k8res[, 4])

# Look at results for 14 clusters ----

# NOTE(review): the cl3k14* names suggest the third gene subset, but this
# clusters ALL genes (cancerdatall) -- confirm which data set was intended.
cl3k14clus <- kmeans(cancerdatall, 14)
cl3k14clus$size
cl3k14res <- cbind(types, cl3k14clus$cluster)
table(cl3k14res$type, cl3k14res[, 4])